#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# -*- coding: utf-8 -*-

# Generates list of unicode ranges belonging to a set of categories, downloading
# Unicode data files as needed.
#
# Usage: genUnicodeTable.py
#
# To generate a new UnicodeData.inc file, the output of this script should be
# passed through clang-format, and then redirected:
#
# hermes/utils/genUnicodeTable.py | clang-format > hermes/lib/Platform/Unicode/UnicodeData.inc

import datetime
import hashlib
import sys
import urllib.request
from collections import defaultdict, OrderedDict
from functools import reduce
from itertools import islice
from string import Template
from textwrap import indent
from typing import Iterable, Optional


class UnicodeDataFiles:
    """
    Lazily loads the Unicode Character Database files used by this script.

    A file is read from the current working directory when a file of that name
    can be opened there, and is downloaded from its entry in `URLS` otherwise.
    Every loaded file is memoized for the lifetime of the process, together
    with the SHA1 of its raw bytes.
    """

    # VERSION = "UCD/latest"  # The bleeding edge version of Unicode.
    VERSION = "15.1.0"
    URLS = {
        "UnicodeData.txt": f"http://unicode.org/Public/{VERSION}/ucd/UnicodeData.txt",
        "SpecialCasing.txt": f"http://unicode.org/Public/{VERSION}/ucd/SpecialCasing.txt",
        "CaseFolding.txt": f"http://unicode.org/Public/{VERSION}/ucd/CaseFolding.txt",
        "DerivedGeneralCategory.txt": f"http://unicode.org/Public/{VERSION}/ucd/extracted/DerivedGeneralCategory.txt",
        "Scripts.txt": f"http://unicode.org/Public/{VERSION}/ucd/Scripts.txt",
        "ScriptExtensions.txt": f"http://unicode.org/Public/{VERSION}/ucd/ScriptExtensions.txt",
        "DerivedCoreProperties.txt": f"http://unicode.org/Public/{VERSION}/ucd/DerivedCoreProperties.txt",
        "DerivedNormalizationProps.txt": f"http://unicode.org/Public/{VERSION}/ucd/DerivedNormalizationProps.txt",
        "DerivedBinaryProperties.txt": f"http://unicode.org/Public/{VERSION}/ucd/extracted/DerivedBinaryProperties.txt",
        "PropertyValueAliases.txt": f"http://unicode.org/Public/{VERSION}/ucd/PropertyValueAliases.txt",
        "PropertyAliases.txt": f"http://unicode.org/Public/{VERSION}/ucd/PropertyAliases.txt",
        "PropList.txt": f"http://unicode.org/Public/{VERSION}/ucd/PropList.txt",
        "emoji-data.txt": f"http://unicode.org/Public/{VERSION}/ucd/emoji/emoji-data.txt",
    }
    # Set to True to keep the downloaded files in the local directory.
    KEEP_LOCAL_CACHE = False

    # Maps a filename to {"sha1": hex digest, "lines": list of data lines}.
    __cache = {}

    @classmethod
    def get(cls, filename):
        """
        Retrieve a Unicode data file, fetching it if necessary.

        Returns a dict with the keys "sha1" (hex digest of the raw bytes) and
        "lines" (the non-empty, non-comment lines). Raises KeyError when
        `filename` is not a key of `URLS`.
        """
        if filename not in cls.__cache:
            data = cls.__local_or_fetch(cls.URLS[filename], filename)
            cls.__cache[filename] = {
                "sha1": hashlib.sha1(data).hexdigest(),
                "lines": cls.__data_to_lines(data),
            }
        return cls.__cache[filename]

    @classmethod
    def __local_or_fetch(cls, url, filename) -> bytes:
        """Read a local file's contents or fetch them from a URL."""
        try:
            with open(filename, "rb") as local_file:
                print(f"Found {filename} locally!", file=sys.stderr)
                return local_file.read()
        except OSError:
            # No usable local copy. The download happens after this handler
            # has finished, so that a network failure is reported on its own
            # instead of being chained onto this unrelated error.
            pass

        print(f"Fetching {url}...", file=sys.stderr)
        with urllib.request.urlopen(url) as response:
            data = response.read()
        if cls.KEEP_LOCAL_CACHE:
            print(f"Caching {filename} locally...", file=sys.stderr)
            with open(filename, "wb") as cache_file:
                cache_file.write(data)
        return data

    @classmethod
    def __data_to_lines(cls, data) -> list[str]:
        """
        Decode `data` as UTF-8 and return its lines, dropping empty lines and
        lines that start with "#".
        """
        return [
            line
            for line in data.decode("utf-8").splitlines()
            if line and not line.startswith("#")
        ]

    @classmethod
    def get_lines(cls, filename) -> Iterable[str]:
        """Return the data lines of `filename`, loading the file if needed."""
        return cls.get(filename)["lines"]

    @classmethod
    def get_sha1(cls, filename) -> str:
        """Return the SHA1 hex digest of `filename`, loading the file if needed."""
        return cls.get(filename)["sha1"]


# Zero-based indexes of the fields used from each semicolon-separated record
# of UnicodeData.txt.
CODEPOINT_FIELD = 0
GENERAL_CATEGORY_FIELD = 2
UPPERCASE_FIELD = 12
LOWERCASE_FIELD = 13


def print_template(s, **kwargs):
    """Substitute the keyword arguments into the template ``s`` and print
    the result, followed by a blank line.

    ``s`` may be a template string or an already constructed
    ``string.Template``. Leading and trailing whitespace of the substituted
    text is stripped before printing.

    Raises KeyError when the template uses a placeholder that has no matching
    keyword argument.
    """
    # Wrapping a Template in another Template would fail on substitution, so
    # an existing instance is used as is.
    template = s if isinstance(s, Template) else Template(s)
    text = template.substitute(**kwargs)
    print(text.strip())
    print("")


def print_header():
    """
    Print the banner of the generated file followed by the C++ struct
    definitions the generated tables rely on.

    The banner records today's date, the Unicode version, and the SHA1 of
    every file listed in `UnicodeDataFiles.URLS` (loading each file to compute
    it).
    """
    generated_on = str(datetime.date.today())
    sha1_comments = [
        f"// {filename:<30} SHA1: {UnicodeDataFiles.get_sha1(filename)}"
        for filename in UnicodeDataFiles.URLS
    ]
    header = """
//
// File generated by genUnicodeTable.py
// using Unicode data files downloaded on ${today}
// for Unicode version ${version}
${sha1s}
// *** DO NOT EDIT BY HAND ***

/// An inclusive range of Unicode characters.
struct UnicodeRange { uint32_t first; uint32_t second; };

/// A UnicodeTransformRange expresses a mapping such as case folding.
/// A character cp is mapped to cp + delta if cp is 0 for the given modulus.
struct UnicodeTransformRange {
    /// The first codepoint of the range.
    unsigned start:24;

    /// The number of characters in the range.
    unsigned count:8;

    /// The signed delta amount.
    int delta:24;

    /// The modulo amount.
    unsigned modulo:8;
};

/// A reference to a string pool entry.
struct StringPoolRef {
  uint16_t offset;
  uint16_t size;
};

/// A reference to a UnicodeRange pool entry.
struct UnicodeRangePoolRef {
  uint16_t offset;
  uint16_t size;
};

/// A reference to a string pool name that maps to a string pool canonical name.
struct NameMapEntry {
    StringPoolRef name;
    StringPoolRef canonical;
};

/// A reference to a string pool name that maps to a range array pool offset
/// and size.
struct RangeMapEntry {
  StringPoolRef name;
  uint16_t rangeArrayPoolOffset;
  uint16_t rangeArraySize;
};
"""
    print_template(
        header,
        today=generated_on,
        sha1s="\n".join(sha1_comments),
        version=UnicodeDataFiles.VERSION,
    )


def run_interval(unicode_data_lines, args):
    """
    Print a C++ `UnicodeRange` array covering a set of general categories.

    `args` is a sequence whose first item names the generated array and whose
    remaining items are the general category values to include.
    `unicode_data_lines` are semicolon-separated records laid out like
    UnicodeData.txt.

    A range opens at the first record whose category is wanted and closes at
    the last record seen before a record whose category is not wanted.
    Codepoints that have no record of their own between two wanted records
    therefore stay inside the range.
    """
    table_name = args[0]
    wanted = set(args[1:])
    spans = []
    span_start = None  # First codepoint of the open range, None when closed.
    prev_cp = 0
    for record in unicode_data_lines:
        columns = record.split(";")
        cp_hex, category = columns[CODEPOINT_FIELD], columns[GENERAL_CATEGORY_FIELD]
        cp = int(cp_hex, 16)
        if category in wanted:
            if span_start is None:
                span_start = cp
        elif span_start is not None:
            # The previous record was the last one inside the range.
            spans.append((span_start, prev_cp))
            span_start = None
        prev_cp = cp

    if span_start is not None:
        spans.append((span_start, prev_cp))

    rows = [f"{{{hex(first)}, {hex(last)}}}," for first, last in spans]
    print_template(
        """
// ${args}
// static constexpr uint32_t ${name}_SIZE = $interval_count;
static constexpr UnicodeRange ${name}[] = {
${intervals}
};
    """,
        args=" ".join(args),
        name=table_name,
        interval_count=len(spans),
        intervals="\n".join(rows),
    )


def print_categories(unicode_data_lines):
    """
    Output one UnicodeRange table per group of Unicode General Categories.

    Each spec below is the table name followed by the general category values
    that the table covers.
    """
    table_specs = (
        ("UNICODE_LETTERS", "Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        ("UNICODE_COMBINING_MARK", "Mn", "Mc"),
        ("UNICODE_DIGIT", "Nd"),
        ("UNICODE_CONNECTOR_PUNCTUATION", "Pc"),
    )
    for spec in table_specs:
        run_interval(unicode_data_lines, list(spec))


def get_assigned_codepoints(unicode_data_lines):
    """
    Gather intervals for all assigned Unicode codepoints.

    `unicode_data_lines` are records laid out like UnicodeData.txt, in
    ascending codepoint order. Returns a list of inclusive `(first, last)`
    tuples:

    - A run of records with consecutive codepoints becomes one interval.
    - A `<..., First>` record and the record that follows it (its `Last`
      partner) become one interval spanning both codepoints.

    Raises ValueError when a `<..., First>` record is the final record.
    """
    intervals = []
    # The run of consecutive codepoints currently being collected; both are
    # None while no run is open.
    run_begin = None
    run_end = None

    def flush_run():
        nonlocal run_begin, run_end
        if run_begin is not None:
            intervals.append((run_begin, run_end))
        run_begin = run_end = None

    lines = iter(unicode_data_lines)
    for line in lines:
        fields = split_fields(line)
        cp = int(fields[0], 16)
        # Handle UnicodeData.txt legacy codepoint ranges.
        # <https://www.unicode.org/reports/tr44/#Code_Point_Ranges>
        if fields[1].startswith("<") and fields[1].endswith("First>"):
            flush_run()
            # The partner record is consumed from the same iterator, so the
            # loop does not see it again.
            last_line = next(lines, None)
            if last_line is None:
                raise ValueError(
                    f"Range starting at {fields[0]} has no matching Last record"
                )
            intervals.append((cp, int(split_fields(last_line)[0], 16)))
        elif run_begin is not None and cp == run_end + 1:
            run_end = cp
        else:
            flush_run()
            run_begin = run_end = cp

    # Emit the run that was still open when the input ended.
    flush_run()
    return intervals


def split_fields(line) -> list[str]:
    """
    Return the semicolon-separated fields of `line` with surrounding
    whitespace removed, ignoring everything from the first "#" onwards.
    """
    data, _, _comment = line.partition("#")
    return [field.strip() for field in data.split(";")]


# A Unicode codepoint range, represented as a tuple of (start, end).
range_tuple = tuple[int, int]


def parse_range(range_str) -> range_tuple:
    """
    Convert a hexadecimal codepoint range such as "1..10", or a single
    codepoint such as "1", into a `(range_start, range_end)` tuple. A single
    codepoint yields a tuple whose start and end are equal.
    """
    if ".." in range_str:
        first_hex, last_hex = range_str.split("..")
    else:
        first_hex = last_hex = range_str
    return (int(first_hex, 16), int(last_hex, 16))


def parse_codepoint_ranges(lines: Iterable[str], pred) -> dict[str, list[range_tuple]]:
    """
    Build a mapping from canonical property name to the list of codepoint
    ranges `(start, end)` recorded for that property in the lines of a Unicode
    Database data file.

    Only lines whose property name satisfies `pred` are used. Consecutive
    accepted lines that name the same property and whose ranges are adjacent
    are merged into a single range.

    Example input:

        0000..001F    ; Cc # Cc       [32] <control-0000>-<control-001F>
        0020          ; Zs # Zs       [1] SPACE
    """
    merged = defaultdict(list)
    current_name = None
    run_start = None  # Start of the range being accumulated, None if none.
    run_end = 0
    for raw_line in lines:
        # Ignore empty lines and comment lines.
        if not raw_line or raw_line.startswith("#"):
            continue

        fields = split_fields(raw_line)
        range_spec, prop_name = fields[0], fields[1]
        if not pred(prop_name):
            continue

        first, last = parse_range(range_spec)
        same_property = current_name is None or current_name == prop_name
        extends_run = (
            same_property and run_start is not None and first == run_end + 1
        )
        if not extends_run:
            # Either the property name changed or there is a gap: close the
            # range collected so far and start a new one.
            if run_start is not None:
                merged[current_name].append((run_start, run_end))
            run_start = first

        current_name = prop_name
        run_end = last

    if run_start is not None:
        merged[current_name].append((run_start, run_end))

    return merged


def parse_property_aliases(
    lines: Iterable[str], get_canonical_name
) -> dict[str, list[str]]:
    """
    Build a mapping from canonical property name to its list of aliases from
    the lines of a Unicode Database data file.

    `get_canonical_name` receives the split fields of a line and returns the
    canonical name to file the line under, or None to skip the line. The value
    stored is every field after the first one.

    Example input:

        gc ; Cc                               ; Control                          ; cntrl
        gc ; Cf                               ; Format

    Example output:

        {
            "Cc": ["Control", "cntrl"],
            "Cf": ["Format"],
        }
    """
    return {
        canonical_name: fields[1:]
        for fields in map(split_fields, lines)
        if (canonical_name := get_canonical_name(fields)) is not None
    }


# A range array pool entry, represented as a tuple of `(offset, size)`: the
# offset of its first range array in the shared range array pool, and the
# number of consecutive range arrays it covers.
range_array_pool_entry = tuple[int, int]


class UnicodePropertyCategory:
    """
    A pool of property names and aliases, and codepoint range arrays, that exist
    with `UnicodeProperties` as the parent, and represent a specific category of
    Unicode properties, such as "General_Category" or "Script".
    """

    def __init__(self, parent=None):
        self.parent: UnicodeProperties = parent
        # Canonical name -> list of aliases (which includes the name itself).
        self._aliases: dict[str, list[str]] = {}
        # Name -> `(offset, size)` into the shared range array pool.
        self._range_array_pool: dict[str, range_array_pool_entry] = OrderedDict()

    def all_names(self) -> list[tuple[str, str]]:
        """
        Sorted list of `(alias, canonical_name)` tuples for all known property
        names in this pool.
        """
        # This is sorted so the C++ code can use a binary search.
        return sorted(
            {
                (alias, name)
                for name, aliases in self._aliases.items()
                for alias in aliases
            }
        )

    def range_array_pool(self):
        """
        Sorted list of `(name, (offset, size))` tuples for the range array pool
        data.
        """
        # This is sorted so the C++ code can use a binary search.
        return sorted(self._range_array_pool.items())

    def add_aliases(self, name, aliases):
        """
        Register `aliases` for the canonical `name`, and forward them to the
        parent's shared string pool when there is a parent.

        Raises ValueError when `name` was already registered, or when `name`
        is not itself one of `aliases`.
        """
        if name in self._aliases:
            raise ValueError(f"Duplicate name {name}")
        elif name not in aliases:
            raise ValueError(f"Canonical name {name} not in aliases")
        self._aliases[name] = aliases

        if self.parent is not None:
            self.parent.add_aliases(name, aliases)

    def mark_range_pool(
        self, category: str, name: str, ranges=None, offset=None, size=None
    ):
        """
        Forward to the parent's `mark_range_pool`; does nothing without a
        parent.
        """
        if self.parent is not None:
            self.parent.mark_range_pool(category, name, ranges, offset, size)

    def mark_range_array_pool(self, name: str, canonical_names: list[str]):
        """
        For a compound property, mark the index and size of the compound
        property's ranges, and return the marked offset.

        For example: "C" is a compound property that refers to the ranges of the
        "Cc", "Cf", "Cn", "Co", and "Cs" properties.

        If all the canonical names are already in the range array pool, then
        this is an overlapping compound property, and the marked entry instead
        refers back to the existing pool entry, instead of adding a new one.
        The existing entries must then be consecutive single entries in the
        order given, because the compound is stored as one `(offset, size)`
        window over them.

        Raises ValueError when `name` is already marked, when
        `canonical_names` is empty, or when an overlapping compound's members
        are not consecutive in the pool.
        """
        pool = self._range_array_pool
        if name in pool:
            raise ValueError(f"Duplicate name {name}")
        if not canonical_names:
            # Without this, `all()` below is vacuously true and indexing the
            # first name fails with an unhelpful IndexError.
            raise ValueError(f"No canonical names given for {name}")

        size = len(canonical_names)

        if all(canonical_name in pool for canonical_name in canonical_names):
            # This is an overlapping compound property, refer back to the first
            # existing pool entry, and do not increment the tracking index.
            offset = pool[canonical_names[0]][0]
            for position, canonical_name in enumerate(canonical_names):
                if pool[canonical_name] != (offset + position, 1):
                    raise ValueError(
                        f"Members of {name} are not contiguous in the range "
                        f"array pool: {canonical_names}"
                    )
            pool[name] = (offset, size)
            return offset

        # New entries: the compound starts at the current shared index, and
        # each member takes the next slot.
        offset = self.parent._range_array_pool_index
        pool[name] = (offset, size)
        for canonical_name in canonical_names:
            pool[canonical_name] = (self.parent._range_array_pool_index, 1)
            self.parent._range_array_pool_index += 1
        return offset

    def mark_range_array_pool_manual(self, name: str, offset: int, size: int):
        """
        Like `mark_range_array_pool` but use manually provided offset and size.

        The parent's range array pool index is advanced by one regardless of
        `size`. Raises ValueError when `name` is already marked.
        """
        if name in self._range_array_pool:
            raise ValueError(f"Duplicate name {name}")

        self._range_array_pool[name] = (offset, size)
        self.parent._range_array_pool_index += 1

    def get_range(
        self, category: str, name: str
    ) -> Optional[tuple[int, Optional[list[tuple[int, int]]], Optional[int]]]:
        """
        Look up the `(offset, ranges, size)` entry stored in the parent's range
        pool under `(category, name)`.

        Returns None when there is no such entry, or when this pool has no
        parent.
        """
        if self.parent is None:
            return None
        return self.parent._range_pool.get((category, name))


# Tuple of `(offset, (range_pool_index, ranges, size))` for a range pool entry.
# NOTE(review): `UnicodeProperties.mark_range_pool` stores flat
# `(offset, ranges, size)` tuples keyed by `(category, name)`; confirm that
# this nested shape matches how range pool entries are actually consumed.
range_pool_entry = tuple[int, tuple[int, Optional[list[range_tuple]], Optional[int]]]


class UnicodeProperties:
    """
    The parent pool for all Unicode property categories, which share common
    string, range, and range array pools. The point of the pools is to
    generate code that is able to efficiently reference the large amount of
    Unicode property data, by using indexes into shared pools.

    The string pool is a shared pool of all property names and aliases, which
    contains all canonical names and aliases.

    The range pool is a shared pool of all codepoint ranges, which is referenced
    by a canonical name.

    The range array pool is a shared pool of codepoint range arrays, which is
    referenced by a canonical name, and refers to one or more ranges in the
    range pool.
    """

    INCLUDE_COMMENTS = True

    def __init__(self):
        """
        Create the empty shared pools and one child pool per property category.
        """
        # The shared string pool: every name and alias seen so far.
        self._names = set()

        # The shared range pool. Keys are `(category, name)` tuples and values
        # are `(offset, ranges, size)` tuples as stored by `mark_range_pool`.
        #
        # The offset lets the per-category range array pools point back into
        # this pool, and the ranges are the data the pool is built from.
        self._range_pool: dict[tuple[str, str], tuple] = OrderedDict()

        # Next free offset in the range pool. Marking an entry that carries
        # ranges advances it by the number of those ranges.
        self._range_pool_index = 0

        # Next free offset in the shared range array pool. It advances by 1
        # for every range array added to one of the property pools.
        self._range_array_pool_index = 0

        # Named integer metrics; a metric that was never set reads as 0.
        self._metrics = defaultdict(int)

        # One child pool per property category, all sharing this parent.
        self.general_category_pool = UnicodePropertyCategory(parent=self)
        self.binary_property_pool = UnicodePropertyCategory(parent=self)
        self.script_property_pool = UnicodePropertyCategory(parent=self)
        self.script_extensions_property_pool = UnicodePropertyCategory(parent=self)

    def log_metrics(self):
        print(
            f"""
string_offset_bits: {self._metrics['string_offset'].bit_length()}
string_size_bits: {self._metrics['string_size'].bit_length()}
range_pool_offset_bits: {self._metrics['range_pool_offset'].bit_length()}
range_pool_size_bits: {self._metrics['range_pool_size'].bit_length()}
range_array_pool_offset_bits: {self._metrics['range_array_pool_offset'].bit_length()}
range_array_pool_size_bits: {self._metrics['range_array_pool_size'].bit_length()}
              """,
            file=sys.stderr,
        )

    def add_aliases(self, name: str, aliases: list[str]):
        """
        Add a name and aliases to the shared string pool.
        """
        self._names.add(name)
        self._names.update(aliases)

    def mark_range_pool(
        self,
        category: str,
        name: str,
        ranges: list[int] = None,
        offset: int = None,
        size: int = None,
    ):
        """
        Mark a range pool entry, with optional codepoint ranges.

        The category is necessary to disambiguate in cases where the name itself
        may not be unique across properties, such as `Scripts` and
        `Script_Extensions` where they share names.
        """
        key = (category, name)
        if key in self._range_pool:
            raise ValueError(f"Duplicate key {key}")

        pool_offset = self._range_pool_index if offset is None else offset
        self._range_pool[key] = (pool_offset, ranges, size)
        if ranges:
            self._range_pool_index += len(ranges)

    def gather_general_category_properties(self):
        """
        Collect the aliases and codepoint ranges of the General_Category
        properties and record them in the shared string and range pools.

        Example aliases input:

            gc ; Cc                               ; Control                          ; cntrl
            gc ; Cf                               ; Format

        Example codepoint input:

            00D8..00DE    ; Lu #   [7] LATIN CAPITAL LETTER O WITH STROKE..LATIN CAPITAL LETTER THORN
            0100          ; Lu #       LATIN CAPITAL LETTER A WITH MACRON
        """

        def gc_canonical_name(fields):
            # Only "gc" rows describe General_Category values.
            return fields[1] if fields[0] == "gc" else None

        gc_aliases = parse_property_aliases(
            UnicodeDataFiles.get_lines("PropertyValueAliases.txt"),
            get_canonical_name=gc_canonical_name,
        )
        gc_ranges = parse_codepoint_ranges(
            UnicodeDataFiles.get_lines("DerivedGeneralCategory.txt"),
            lambda canonical_name: canonical_name in gc_aliases,
        )

        gc_pool = self.general_category_pool

        # Feed the General_Category names and aliases into the string pool.
        for canonical_name, aliases in gc_aliases.items():
            gc_pool.add_aliases(canonical_name, aliases)

        # Compound General_Category values never appear next to a codepoint;
        # each one is the union of the listed values.
        #
        # NOTE: Ranges shared between compound groups must overlap, so that
        #       each group can be described by one contiguous offset+size.
        #
        # <https://www.unicode.org/reports/tr44/#General_Category_Values>
        compound_groups = (
            ("C", ["Cc", "Cf", "Cn", "Co", "Cs"]),
            ("L", ["Ll", "Lt", "Lu", "Lm", "Lo"]),
            ("LC", ["Ll", "Lt", "Lu"]),
            ("M", ["Mc", "Me", "Mn"]),
            ("N", ["Nd", "Nl", "No"]),
            ("P", ["Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps"]),
            ("S", ["Sc", "Sk", "Sm", "So"]),
            ("Z", ["Zl", "Zp", "Zs"]),
        )

        category = "General_Category"
        for compound_name, members in compound_groups:
            gc_pool.mark_range_pool(category, compound_name)

            for member in members:
                member_ranges = gc_ranges[member]
                if gc_pool.get_range(category, member) is None:
                    gc_pool.mark_range_pool(category, member, member_ranges)

            gc_pool.mark_range_array_pool(compound_name, members)

        # Whatever was parsed but is not a member of a compound group gets a
        # range array of its own.
        for canonical_name, ranges in gc_ranges.items():
            if gc_pool.get_range(category, canonical_name) is not None:
                continue
            gc_pool.mark_range_pool(category, canonical_name, ranges)
            gc_pool.mark_range_array_pool(canonical_name, [canonical_name])

    def gather_binary_properties(self):
        """
        Collect the aliases and codepoint ranges of the binary (true/false)
        properties that ECMA262 explicitly allows, and record them in the
        shared string and range pools.

        <https://tc39.es/ecma262/multipage/text-processing.html#table-binary-unicode-properties>

        Example property aliases input:

            # Alias ; Canonical name  ; Additional alias

            AHex    ; ASCII_Hex_Digit
            Alpha   ; Alphabetic
            WSpace  ; White_Space     ; space
        """
        allowed_names = [
            "ASCII",
            "ASCII_Hex_Digit",
            "Alphabetic",
            "Bidi_Control",
            "Bidi_Mirrored",
            "Case_Ignorable",
            "Cased",
            "Changes_When_Casefolded",
            "Changes_When_Casemapped",
            "Changes_When_Lowercased",
            "Changes_When_NFKC_Casefolded",
            "Changes_When_Titlecased",
            "Changes_When_Uppercased",
            "Dash",
            "Default_Ignorable_Code_Point",
            "Deprecated",
            "Diacritic",
            "Emoji",
            "Emoji_Component",
            "Emoji_Modifier",
            "Emoji_Modifier_Base",
            "Emoji_Presentation",
            "Extended_Pictographic",
            "Extender",
            "Grapheme_Base",
            "Grapheme_Extend",
            "Hex_Digit",
            "IDS_Binary_Operator",
            "IDS_Trinary_Operator",
            "ID_Continue",
            "ID_Start",
            "Ideographic",
            "Join_Control",
            "Logical_Order_Exception",
            "Lowercase",
            "Math",
            "Noncharacter_Code_Point",
            "Pattern_Syntax",
            "Pattern_White_Space",
            "Quotation_Mark",
            "Radical",
            "Regional_Indicator",
            "Sentence_Terminal",
            "Soft_Dotted",
            "Terminal_Punctuation",
            "Unified_Ideograph",
            "Uppercase",
            "Variation_Selector",
            "White_Space",
            "XID_Continue",
            "XID_Start",
        ]

        # Every allowed name starts without aliases; they are filled in from
        # PropertyAliases.txt below.
        binary_property_aliases = {name: [] for name in allowed_names}

        for line in UnicodeDataFiles.get_lines("PropertyAliases.txt"):
            fields = split_fields(line)
            canonical_name = fields[1]
            if canonical_name not in binary_property_aliases:
                continue
            assert not binary_property_aliases[canonical_name], "Duplicate canonical name"
            # All fields of the row (including the canonical name itself),
            # with repeats removed.
            binary_property_aliases[canonical_name] = list(set(fields))

        def is_allowed_name(canonical_name):
            return canonical_name in binary_property_aliases

        # The binary properties are spread over several data files. They are
        # read in this order, and a property found in a later file replaces
        # the ranges found for it in an earlier one.
        range_sources = (
            "PropList.txt",
            "DerivedCoreProperties.txt",
            "DerivedNormalizationProps.txt",
            "DerivedBinaryProperties.txt",
            "emoji-data.txt",
        )
        binary_property_ranges = {}
        for source_name in range_sources:
            binary_property_ranges.update(
                parse_codepoint_ranges(
                    UnicodeDataFiles.get_lines(source_name), is_allowed_name
                )
            )

        # Manually add cases that are not part of the enumerations.
        # <https://unicode.org/reports/tr18/#General_Category_Property>
        binary_property_aliases["ASCII"] = ["ASCII"]
        binary_property_ranges["ASCII"] = [(0x0, 0x7F)]

        binary_property_aliases["Any"] = ["Any"]
        binary_property_ranges["Any"] = [(0x0, 0x10FFFF)]

        binary_property_aliases["Assigned"] = ["Assigned"]
        binary_property_ranges["Assigned"] = get_assigned_codepoints(
            UnicodeDataFiles.get_lines("UnicodeData.txt")
        )

        binary_pool = self.binary_property_pool

        # Feed the binary property names and aliases into the string pool.
        for canonical_name, aliases in binary_property_aliases.items():
            binary_pool.add_aliases(canonical_name, aliases)

        category = "Binary"
        for canonical_name, ranges in binary_property_ranges.items():
            if binary_pool.get_range(category, canonical_name) is None:
                binary_pool.mark_range_pool(category, canonical_name, ranges)
            binary_pool.mark_range_array_pool(canonical_name, [canonical_name])

    def gather_script_properties(self):
        """
        Gather script and script extensions property aliases and codepoint
        ranges, as they exist in the Unicode Database, into the string and range
        pools.

        Script and script extensions are interleaved so that the ranges are
        contiguous.

        NOTE: Script extensions don't have their own names, instead they re-use
        the Script property names. However, the ranges are referenced by the
        alias, not the canonical name, which differs from how scripts are
        handled.

        Example property values aliases input:

            # Category ; Alias ; Canonical name

            sc         ; Arab  ; Arabic
            sc         ; Latn  ; Latin

        Example scripts input:

            0041..005A    ; Latin # L&  [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
            0600..0604    ; Arabic # Cf   [5] ARABIC NUMBER SIGN..ARABIC SIGN SAMVAT
        """
        script_property_aliases = parse_property_aliases(
            UnicodeDataFiles.get_lines("PropertyValueAliases.txt"),
            get_canonical_name=lambda fields: fields[2] if fields[0] == "sc" else None,
        )

        # This property is fictional, and is never directly referenced in the
        # codepoint data. Instead, Katakana (Kana) and Hiragana (Hira) are used
        # separately.
        #
        # <https://www.unicode.org/reports/tr44/#Allowed_Changes>
        del script_property_aliases["Katakana_Or_Hiragana"]

        script_property_ranges = parse_codepoint_ranges(
            UnicodeDataFiles.get_lines("Scripts.txt"),
            lambda canonical_name: canonical_name in script_property_aliases,
        )

        script_property_aliases_by_alias = parse_property_aliases(
            UnicodeDataFiles.get_lines("PropertyValueAliases.txt"),
            get_canonical_name=lambda fields: fields[1] if fields[0] == "sc" else None,
        )
        raw_property_ranges = parse_codepoint_ranges(
            UnicodeDataFiles.get_lines("ScriptExtensions.txt"), lambda _: True
        )
        # Because script extension codepoints are referenced by the script
        # property alias, not the canonical name, the ranges need to be manually
        # remapped.
        script_extensions_property_ranges = defaultdict(list)
        for key, ranges in raw_property_ranges.items():
            for short_key in key.split():
                # Script extension codepoints use the script property alias, not
                # the canonical name.
                canonical_name = script_property_aliases_by_alias[short_key][1]
                script_extensions_property_ranges[canonical_name].extend(ranges)

        pool = self.script_property_pool
        ext_pool = self.script_extensions_property_pool
        # Update the string pool with the script property names and aliases
        for name, aliases in script_property_aliases.items():
            pool.add_aliases(name, aliases)

        cat = "Script"
        ext_cat = "Script_Extensions"
        for canonical_name, ranges in script_property_ranges.items():
            if pool.get_range(cat, canonical_name) is None:
                pool.mark_range_pool(cat, canonical_name, ranges)
            script_range_array_offset = pool.mark_range_array_pool(
                canonical_name, [canonical_name]
            )

            # Script extensions are a superset of the script property ranges,
            # they are added immediately after the corresponding script so that
            # the ranges are contiguous.
            ext_ranges = script_extensions_property_ranges[canonical_name]
            if ext_ranges and ext_pool.get_range(ext_cat, canonical_name) is None:
                script_range_offset = pool.get_range(cat, canonical_name)[0]
                ext_pool.mark_range_pool(
                    ext_cat,
                    canonical_name,
                    ext_ranges,
                    # Start the range pool offset at the same offset as the
                    # corresponding script, and extend the size to include both
                    # the script and script extension ranges.
                    offset=script_range_offset,
                    size=len(ranges) + len(ext_ranges),
                )
                # Manually mark the range array pool entry for the script
                # extension. This is necessary because the script extension
                # needs to refer to offset for the script, and cover a range of
                # 2, but still only increment the range array pool index by 1.
                ext_pool.mark_range_array_pool_manual(
                    canonical_name,
                    script_range_array_offset,
                    # The size is 2 because it covers the script range (1) and
                    # the extension range (1).
                    2,
                )

        # Manually map the "Zzzz" / "Unknown" script property to the "Cn" /
        # "Unassigned" range.
        pool._range_array_pool["Unknown"] = (
            self.general_category_pool._range_array_pool["Cn"]
        )

    def print_template(self):
        """
        Produce the generated C++ code for the gathered Unicode properties data.

        This includes the string pool, range pool, and range array pool data.
        """
        all_strings = sorted(
            self._names,
            key=lambda name: (len(name), name),
            reverse=True,
        )
        string_pool = reduce(
            lambda acc, item: acc if item in acc else acc + item,
            all_strings,
            "",
        )

        def string_coord(name):
            """
            Build a string pool lookup reference for a given name.

            Example output:

                { offset, size }
            """
            offset = string_pool.index(name)
            size = len(name)
            self._metrics["string_offset"] = max(self._metrics["string_offset"], offset)
            self._metrics["string_size"] = max(self._metrics["string_size"], size)
            assert offset + size < 0xFFFF, "String pool offset+size exceeds uint16_t"
            return f"{{ {offset}, {size} }}"

        def _range_pool():
            """
            Using the range pool, generate the UnicodeRange entries for the C++
            code, that reference the shared range pool.

            Example output:

                static constexpr UnicodeRange UNICODE_RANGE_POOL[] = {
                    // General_Category: Cc
                    {0x0000, 0x001F},
                    {0x007F, 0x009F},
                };
            """
            for (cat, name), (offset, ranges, range_size) in self._range_pool.items():
                if self.INCLUDE_COMMENTS:
                    yield f"// {cat}: {name}"
                if ranges:
                    # Manually batch the ranges into 3 per line, because
                    # clang-format wants to format them to a single item per
                    # line.
                    for batch in batched(ranges, 3):
                        yield "".join(
                            f"{{{as_hex(start)}, {as_hex(end)}}},".ljust(20)
                            for start, end in batch
                        ).strip()

        def _range_array_pool():
            """
            Using the range array pool, generate the UnicodeRangePoolRef entries
            for the C++ code, that reference the shared range pool.

            Example output:

                static constexpr UnicodeRangePoolRef UNICODE_RANGE_ARRAY_POOL[] {
                    // General_Category: Cc
                    {0, 2},
                    // General_Category: Cf
                    {2, 21},
                };
            """
            for (cat, name), (offset, ranges, range_size) in self._range_pool.items():
                if self.INCLUDE_COMMENTS:
                    yield f"// {cat}: {name}"
                if ranges is not None:
                    size = len(ranges) if range_size is None else range_size
                    assert (
                        offset + size < 0xFFFF
                    ), "Range array offset+size exceeds uint16_t"
                    self._metrics["range_pool_offset"] = max(
                        self._metrics["range_pool_offset"], offset
                    )
                    self._metrics["range_pool_size"] = max(
                        self._metrics["range_pool_size"], size
                    )
                    yield f"{{ {offset}, {size} }},"

        def _build_name_map(pool: UnicodePropertyCategory):
            """
            For a given pool, build the NameMapEntry entries for the C++ code,
            that reference the shared string pool.

            Example output:

                static constexpr NameMapEntry canonicalPropertyNameMap_GeneralCategory[] = {
                    // "C", "C"
                    {{18, 1}, {18, 1}},
                    // "Cased_Letter", "LC"
                    {{1368, 12}, {3008, 2}},
                };
            """
            for alias, name in pool.all_names():
                if self.INCLUDE_COMMENTS:
                    yield f'// "{alias}", "{name}"'
                yield f"{{ {string_coord(alias)}, {string_coord(name)} }},"

        def _build_range_map(pool: UnicodePropertyCategory):
            """
            For a given pool, build the RangeMapEntry entries for the C++ code,
            that reference the shared range array pool.

            Example output:

                static constexpr RangeMapEntry unicodePropertyRangeMap_GeneralCategory[] = {
                    // "C"
                    {{18, 1}, 0, 5},
                    // "Cc"
                    {{3018, 2}, 0, 1},
                };
            """
            for name, (offset, size) in pool.range_array_pool():
                if self.INCLUDE_COMMENTS:
                    yield f'// "{name}"'
                assert (
                    offset + size < 0xFFFF
                ), "Range array map offset+size exceeds uint16_t"
                self._metrics["range_array_pool_offset"] = max(
                    self._metrics["range_array_pool_offset"], offset
                )
                self._metrics["range_array_pool_size"] = max(
                    self._metrics["range_array_pool_size"], size
                )
                yield f"{{ {string_coord(name)}, {offset}, {size} }},"

        print_template(
            """
#ifdef HERMES_ENABLE_UNICODE_REGEXP_PROPERTY_ESCAPES

static constexpr std::string_view UNICODE_DATA_STRING_POOL = "${string_pool}";

// clang-format off
static constexpr UnicodeRange UNICODE_RANGE_POOL[] = {
${range_pool}
};
// clang-format on

static constexpr UnicodeRangePoolRef UNICODE_RANGE_ARRAY_POOL[] {
${range_array_pool}
};

static constexpr NameMapEntry canonicalPropertyNameMap_GeneralCategory[] = {
${name_map_general_category}
};

static constexpr RangeMapEntry unicodePropertyRangeMap_GeneralCategory[] = {
${range_map_general_category}
};

static constexpr NameMapEntry canonicalPropertyNameMap_BinaryProperty[] = {
${name_map_binary_property}
};

static constexpr RangeMapEntry unicodePropertyRangeMap_BinaryProperty[] = {
${range_map_binary_property}
};

static constexpr NameMapEntry canonicalPropertyNameMap_Script[] = {
${name_map_script_property}
};

static constexpr RangeMapEntry unicodePropertyRangeMap_Script[] = {
${range_map_script_property}
};

static constexpr RangeMapEntry unicodePropertyRangeMap_ScriptExtensions[] = {
${range_map_script_extensions_property}
};

#endif
    """,
            string_pool=string_pool,
            range_pool=indent("\n".join(_range_pool()), "    "),
            range_array_pool="\n".join(_range_array_pool()),
            name_map_general_category="\n".join(
                _build_name_map(self.general_category_pool)
            ),
            range_map_general_category="\n".join(
                _build_range_map(self.general_category_pool)
            ),
            name_map_binary_property="\n".join(
                _build_name_map(self.binary_property_pool)
            ),
            range_map_binary_property="\n".join(
                _build_range_map(self.binary_property_pool)
            ),
            name_map_script_property="\n".join(
                _build_name_map(self.script_property_pool)
            ),
            range_map_script_property="\n".join(
                _build_range_map(self.script_property_pool)
            ),
            # NOTE: There is no canonical name mapping for Script_Extensions,
            # instead the one for Script is reused.
            range_map_script_extensions_property="\n".join(
                _build_range_map(self.script_extensions_property_pool)
            ),
        )


def stride_from(p1, p2):
    """Return how far the first element of `p2` lies past that of `p1`."""
    later, earlier = p2[0], p1[0]
    return later - earlier


def delta_within(p):
    """Return the second element of the pair `p` minus its first element."""
    target, source = p[1], p[0]
    return target - source


def as_hex(cp):
    """
    Format `cp` as `0x` followed by upper-case hexadecimal digits, zero-padded
    to at least four digits.
    """
    hex_format = "0x%.4X"
    return hex_format % cp


def batched(iterable, n):
    """
    Yield consecutive tuples of up to `n` items taken from `iterable`.

    Stand-in for `itertools.batched` (Python 3.12), following the rough
    equivalent given in the Python documentation:

    <https://docs.python.org/3/library/itertools.html#itertools.batched>

    Only the final tuple may hold fewer than `n` items. A ValueError is raised
    if `n` is smaller than one.

    >>> list(batched('ABCDEFG', 3))
    [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]
    """
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            # The source is exhausted.
            return
        yield chunk


class DeltaMapBlock:
    """
    A run of pairs that all share one delta and one stride, so that the whole
    run can be written out as a single `{first, count, delta, modulo}` entry.
    """

    def __init__(self):
        # The pairs of this run, in the order they were appended.
        self.pairs = []

    def stride(self):
        """Return the stride between the first two pairs of the run."""
        head, second = self.pairs[0], self.pairs[1]
        return stride_from(head, second)

    def delta(self):
        """Return the delta within the first pair of the run."""
        head = self.pairs[0]
        return delta_within(head)

    def can_append(self, pair):
        """
        Return whether `pair` may join this run.

        An empty run accepts anything. Otherwise the pair's first element must
        be fewer than 256 past the first element of the run, its delta must
        equal the run's delta, and, once the run holds at least two pairs, its
        stride from the last pair must equal the run's stride.
        """
        existing = self.pairs
        if not existing:
            return True
        if pair[0] - existing[0][0] >= 256:
            return False
        if self.delta() != delta_within(pair):
            return False
        if len(existing) < 2:
            # A single pair has not fixed the stride yet.
            return True
        return self.stride() == stride_from(existing[-1], pair)

    @staticmethod
    def append_to_list(blocks, p):
        """
        Append `p` to the last block of `blocks`, opening a new block first
        when there is no block yet or the last one cannot take `p`.
        """
        fits_last_block = blocks and blocks[-1].can_append(p)
        if not fits_last_block:
            blocks.append(DeltaMapBlock())
        blocks[-1].pairs.append(p)

    def output(self):
        """
        Return the run as a `{first, count, delta, modulo}` initializer, or an
        empty string when the run holds no pairs.

        `count` spans from the first element of the first pair to that of the
        last pair, inclusive; `modulo` is the stride, or 1 for a single pair.
        """
        if not self.pairs:
            return ""

        first = self.pairs[0][0]
        last = self.pairs[-1][0]
        modulo = 1 if len(self.pairs) < 2 else self.stride()
        entry = Template("{$first, $count, $delta, $modulo}")
        rendered = entry.substitute(
            first=as_hex(first),
            count=last - first + 1,
            delta=self.delta(),
            modulo=modulo,
        )
        return rendered.strip()


class CaseMap:
    """Helper for Unicode case mappings.

    Stores the list of codepoints together with their simple uppercase,
    lowercase and case-folded forms.

    """

    def __init__(self, unicode_data_lines, special_casing_lines, casefolding_lines):
        """Build from the lines of UnicodeData, SpecialCasing and CaseFolding."""
        self.toupper = {}
        self.tolower = {}
        self.codepoints = []
        for entry in unicode_data_lines:
            columns = entry.split(";")
            self.__set_casemap(
                columns[CODEPOINT_FIELD],
                upper=columns[UPPERCASE_FIELD],
                lower=columns[LOWERCASE_FIELD],
            )
        # The codepoint list is taken from UnicodeData alone, before the
        # special cases below are applied.
        self.codepoints.extend(self.toupper)

        # Special cases exist to support ES5.1 Canonicalize, which is phrased
        # in terms of toUpperCase(). The result has to be locale-independent,
        # so locale-specific SpecialCasing rules are ignored. Canonicalize
        # looks at one character only, so context-sensitive rules can be
        # ignored as well. In short: skip every rule that carries a condition.
        # Fields are: codepoint, lower, title, upper, condition.
        for entry in special_casing_lines:
            # Drop the trailing comment before splitting into fields.
            columns = entry.partition("#")[0].split(";")
            if len(columns) < 5:
                continue
            # The title case field is not used.
            cps, lower, _title, upper, condition = columns[:5]
            if condition.strip():
                continue
            self.__set_casemap(cps, upper=upper, lower=lower)

        # By default every codepoint folds to itself.
        self.folds = dict(zip(self.codepoints, self.codepoints))

        # Overlay the case folds read from CaseFolding.
        for entry in casefolding_lines:
            columns = [
                column.strip() for column in entry.partition("#")[0].split(";")
            ]
            if len(columns) != 4:
                continue
            orig, status, folded, _ = columns
            # Only common (C) and simple (S) case foldings are of interest.
            if status in ("C", "S"):
                self.folds[int(orig, 16)] = int(folded, 16)

    def __set_casemap(self, cp, upper, lower):
        """Record the upper and lower case forms of a codepoint.

        All parameters are codepoints written as hexadecimal strings. A form
        that is empty, or that consists of more than one codepoint, leaves the
        character mapped to itself.

        """
        codepoint = int(cp, 16)

        def single_or_self(mapping):
            # "The simple uppercase is omitted in the data file if the
            # uppercase is the same as the code point itself." The same holds
            # for the lowercase. Eszett and anything else that maps to more
            # than one character is skipped as well.
            if mapping and len(mapping.split()) == 1:
                return int(mapping, 16)
            return codepoint

        self.toupper[codepoint] = single_or_self(upper)
        self.tolower[codepoint] = single_or_self(lower)

    def canonicalize(self, ch, unicode):
        """Canonicalize a character per ES9 21.2.2.8.2."""
        if unicode:
            return self.folds[ch]

        upper_ch = self.toupper[ch]
        # "If u does not consist of a single character, return ch": only 1-1
        # mappings are stored, so that case cannot arise here.
        # "If ch's code unit value is greater than or equal to decimal 128 and
        # cu's code unit value is less than decimal 128, then return ch": in
        # other words, only ASCII may canonicalize to ASCII.
        maps_into_ascii = upper_ch < 128 and ch >= 128
        return ch if maps_into_ascii else upper_ch


def print_canonicalizations(casemap, unicode):
    """
    Print the canonicalization table derived from `casemap`.

    Every codepoint whose canonical form differs from itself is packed into
    DeltaMapBlock runs. The runs are emitted as a C++ array named
    UNICODE_FOLDS when `unicode` is true and LEGACY_CANONS otherwise.
    """
    runs = []
    for cp in casemap.codepoints:
        # Legacy mode does not decode surrogate pairs, so large code points
        # can be skipped.
        if not unicode and cp > 0xFFFF:
            continue
        canon_cp = casemap.canonicalize(cp, unicode)
        if canon_cp == cp:
            continue
        DeltaMapBlock.append_to_list(runs, (cp, canon_cp))

    if unicode:
        table_name = "UNICODE_FOLDS"
    else:
        table_name = "LEGACY_CANONS"
    entries = [run.output() for run in runs]

    template = """
// static constexpr uint32_t ${name}_SIZE = ${entry_count};
static constexpr UnicodeTransformRange ${name}[] = {
${entry_text}
};
"""
    print_template(
        template,
        name=table_name,
        entry_count=len(runs),
        entry_text=",\n".join(entries),
    )


if __name__ == "__main__":
    # Script entry point: produce each section of the generated file in turn.
    # NOTE(review): the print_* helpers are defined outside this section;
    # presumably each one writes to stdout, in which case the call order below
    # is the order of the sections in the generated output -- confirm before
    # reordering anything.
    print_header()

    print_categories(UnicodeDataFiles.get_lines("UnicodeData.txt"))

    # Gather all three kinds of property (general category, binary, script)
    # before calling print_template(), which emits the shared pools and the
    # per-kind name and range maps together.
    unicode_properties = UnicodeProperties()
    unicode_properties.gather_general_category_properties()
    unicode_properties.gather_binary_properties()
    unicode_properties.gather_script_properties()
    unicode_properties.print_template()
    # Show information about bit sizes for the string and range pools.
    # unicode_properties.log_metrics()

    # Case mapping tables, built from UnicodeData, SpecialCasing and
    # CaseFolding.
    casemap = CaseMap(
        unicode_data_lines=UnicodeDataFiles.get_lines("UnicodeData.txt"),
        special_casing_lines=UnicodeDataFiles.get_lines("SpecialCasing.txt"),
        casefolding_lines=UnicodeDataFiles.get_lines("CaseFolding.txt"),
    )
    # unicode=True emits the UNICODE_FOLDS table, unicode=False emits the
    # LEGACY_CANONS table.
    print_canonicalizations(casemap, unicode=True)
    print_canonicalizations(casemap, unicode=False)
